library(biomaRt)
package ‘biomaRt’ was built under R version 3.6.3Registered S3 method overwritten by 'dplyr':
method from
print.rowwise_df
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
library(DESeq2)
Loading required package: S4Vectors
package ‘S4Vectors’ was built under R version 3.6.3Loading required package: stats4
Loading required package: BiocGenerics
Loading required package: parallel
Attaching package: ‘BiocGenerics’
The following objects are masked from ‘package:parallel’:
clusterApply, clusterApplyLB, clusterCall, clusterEvalQ, clusterExport,
clusterMap, parApply, parCapply, parLapply, parLapplyLB, parRapply, parSapply,
parSapplyLB
The following objects are masked from ‘package:stats’:
IQR, mad, sd, var, xtabs
The following objects are masked from ‘package:base’:
anyDuplicated, append, as.data.frame, basename, cbind, colnames, dirname,
do.call, duplicated, eval, evalq, Filter, Find, get, grep, grepl, intersect,
is.unsorted, lapply, Map, mapply, match, mget, order, paste, pmax, pmax.int,
pmin, pmin.int, Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
table, tapply, union, unique, unsplit, which, which.max, which.min
Attaching package: ‘S4Vectors’
The following object is masked from ‘package:base’:
expand.grid
Loading required package: IRanges
Loading required package: GenomicRanges
Loading required package: GenomeInfoDb
package ‘GenomeInfoDb’ was built under R version 3.6.3Loading required package: SummarizedExperiment
Loading required package: Biobase
Welcome to Bioconductor
Vignettes contain introductory material; view with 'browseVignettes()'. To cite
Bioconductor, see 'citation("Biobase")', and for packages 'citation("pkgname")'.
Loading required package: DelayedArray
package ‘DelayedArray’ was built under R version 3.6.3Loading required package: matrixStats
Attaching package: ‘matrixStats’
The following objects are masked from ‘package:Biobase’:
anyMissing, rowMedians
Loading required package: BiocParallel
Attaching package: ‘DelayedArray’
The following objects are masked from ‘package:matrixStats’:
colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
The following objects are masked from ‘package:base’:
aperm, apply, rowsum
Registered S3 methods overwritten by 'htmltools':
method from
print.html tools:rstudio
print.shiny.tag tools:rstudio
print.shiny.tag.list tools:rstudio
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
Registered S3 method overwritten by 'data.table':
method from
print.data.table
library(sleuth)
library(tximport)
package ‘tximport’ was built under R version 3.6.3
library(jsonlite)
library(data.table)
data.table 1.12.8 using 4 threads (see ?getDTthreads). Latest news: r-datatable.com
Attaching package: ‘data.table’
The following object is masked from ‘package:SummarizedExperiment’:
shift
The following object is masked from ‘package:GenomicRanges’:
shift
The following object is masked from ‘package:IRanges’:
shift
The following objects are masked from ‘package:S4Vectors’:
first, second
library(vsn)
library(EnhancedVolcano)
Loading required package: ggplot2
Loading required package: ggrepel
library(viridis)
Loading required package: viridisLite
library(ggpubr)
Loading required package: magrittr
library(MASS)
Attaching package: ‘MASS’
The following object is masked from ‘package:biomaRt’:
select
library(rtracklayer)
library(VennDiagram)
Loading required package: grid
Loading required package: futile.logger
Attaching package: ‘VennDiagram’
The following object is masked from ‘package:ggpubr’:
rotate
library(RColorBrewer)
library(ComplexHeatmap)
========================================
ComplexHeatmap version 2.2.0
Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
Github page: https://github.com/jokergoo/ComplexHeatmap
Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
If you use it in published research, please cite:
Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional
genomic data. Bioinformatics 2016.
========================================
library(RColorBrewer)
library(circlize)
========================================
circlize version 0.4.8
CRAN page: https://cran.r-project.org/package=circlize
Github page: https://github.com/jokergoo/circlize
Documentation: http://jokergoo.github.io/circlize_book/book/
If you use it in published research, please cite:
Gu, Z. circlize implements and enhances circular visualization
in R. Bioinformatics 2014.
========================================
library(tidyverse)
[37m── [1mAttaching packages[22m ────────────────────────────────────────────────────── tidyverse 1.3.0 ──[39m
[37m[32m✓[37m [34mtibble [37m 3.0.1 [32m✓[37m [34mdplyr [37m 0.8.5
[32m✓[37m [34mtidyr [37m 1.0.2 [32m✓[37m [34mstringr[37m 1.4.0
[32m✓[37m [34mreadr [37m 1.3.1 [32m✓[37m [34mforcats[37m 0.5.0
[32m✓[37m [34mpurrr [37m 0.3.4 [39m
[37m── [1mConflicts[22m ───────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31mx[37m [34mdplyr[37m::[32mbetween()[37m masks [34mdata.table[37m::between()
[31mx[37m [34mdplyr[37m::[32mcollapse()[37m masks [34mIRanges[37m::collapse()
[31mx[37m [34mdplyr[37m::[32mcombine()[37m masks [34mBiobase[37m::combine(), [34mBiocGenerics[37m::combine()
[31mx[37m [34mdplyr[37m::[32mcount()[37m masks [34mmatrixStats[37m::count()
[31mx[37m [34mdplyr[37m::[32mdesc()[37m masks [34mIRanges[37m::desc()
[31mx[37m [34mtidyr[37m::[32mexpand()[37m masks [34mS4Vectors[37m::expand()
[31mx[37m [34mtidyr[37m::[32mextract()[37m masks [34mmagrittr[37m::extract()
[31mx[37m [34mdplyr[37m::[32mfilter()[37m masks [34mstats[37m::filter()
[31mx[37m [34mdplyr[37m::[32mfirst()[37m masks [34mdata.table[37m::first(), [34mS4Vectors[37m::first()
[31mx[37m [34mpurrr[37m::[32mflatten()[37m masks [34mjsonlite[37m::flatten()
[31mx[37m [34mdplyr[37m::[32mlag()[37m masks [34mstats[37m::lag()
[31mx[37m [34mdplyr[37m::[32mlast()[37m masks [34mdata.table[37m::last()
[31mx[37m [34mggplot2[37m::[32mPosition()[37m masks [34mBiocGenerics[37m::Position(), [34mbase[37m::Position()
[31mx[37m [34mpurrr[37m::[32mreduce()[37m masks [34mGenomicRanges[37m::reduce(), [34mIRanges[37m::reduce()
[31mx[37m [34mdplyr[37m::[32mrename()[37m masks [34mS4Vectors[37m::rename()
[31mx[37m [34mdplyr[37m::[32mselect()[37m masks [34mMASS[37m::select(), [34mbiomaRt[37m::select()
[31mx[37m [34mpurrr[37m::[32mset_names()[37m masks [34mmagrittr[37m::set_names()
[31mx[37m [34mpurrr[37m::[32msimplify()[37m masks [34mDelayedArray[37m::simplify()
[31mx[37m [34mdplyr[37m::[32mslice()[37m masks [34mIRanges[37m::slice()
[31mx[37m [34mpurrr[37m::[32mtranspose()[37m masks [34mdata.table[37m::transpose()[39m
set.seed(42)
RESPONSE_COLORS = c("NR"="purple","R"="orange")
# Load in samples and point to kallisto paths
samples <- read_tsv("../work/input/sample_manifest.tsv")
Parsed with column specification:
cols(
`Patient ID` = [31mcol_character()[39m,
`GSM ID` = [31mcol_character()[39m,
Response = [31mcol_character()[39m,
SRR = [31mcol_character()[39m
)
samples$kallisto_path <- file.path("..","work","intermediate","quant",samples$SRR)
mart <- biomaRt::useMart(biomart = "ENSEMBL_MART_ENSEMBL",
dataset = "hsapiens_gene_ensembl",
host = 'grch37.ensembl.org')
gene_features <- biomaRt::getBM(attributes = c("ensembl_transcript_id_version", "ensembl_gene_id",
"external_gene_name","transcript_length"), mart = mart)
Cache found
t2g <- gene_features %>%
select(ensembl_transcript_id_version,ensembl_gene_id,external_gene_name)
avg_tx_len <-gene_features %>%
group_by(external_gene_name) %>%
summarise(length=mean(transcript_length))
We’ll filter high quality samples by percent reads mapped by Kallisto. We’ll need to get hte total reads in files from the kallisto logs, and the total number of mapped reads by pre-loading all the counts using sleuth.
# Find total reads to compute percent mapped
samples$n_reads <- apply(samples,1,function(x)read_json(file.path(x["kallisto_path"],"run_info.json"))$n_processed)
# Initial sleuth just to get kallisto counts
s_obj_counts <- sleuth_prep(
sample_to_covariates = rename(samples,path=kallisto_path,sample=SRR),
normalize=FALSE
)
It appears that you are running Sleuth from within Rstudio.
Because of concerns with forking processes from a GUI, 'num_cores' is being set to 1.
If you wish to take advantage of multiple cores, please consider running sleuth from the command line.reading in kallisto results
dropping unused factor levels
................
86131 targets passed the filter
kallisto_est_mapped <- s_obj_counts$obs_raw %>%
group_by(sample) %>%
summarise(kallisto_mapped_reads=sum(est_counts))
samples <- samples %>%
left_join(kallisto_est_mapped,by=c("SRR"="sample")) %>%
mutate(percent_mapped=kallisto_mapped_reads/n_reads)
samples %>%
ggplot(aes(x = percent_mapped)) +
geom_histogram(bins = 10) +
labs(title = "Distribution of % Mapped Reads", x = "% Mapped Reads")
# Filter samples
samples_filt <-samples %>%
filter(
!SRR %in% c("SRR8526726"),
#percent_mapped > 0.8
)
plot_pca(s_obj,units="scaled_reads_per_base",color_by = "Response",use_filtered = TRUE,text_labels = F) + geom_label(aes(label=sample),size=2)
sleuth_rna_norm <- s_obj$obs_norm_filt %>%
select(sample,target_id,scaled_reads_per_base) %>%
spread(key=target_id,value=scaled_reads_per_base) %>%
column_to_rownames("sample")
sleuth_std_pca <- prcomp((sleuth_rna_norm),center=T,scale.=T)
sleuth_variance_explained = summary(sleuth_std_pca)$importance[2,1:2]
sleuth_std_pca$x[,1:2] %>%
as.data.frame() %>%
rownames_to_column("SRR") %>%
left_join(samples,by="SRR") %>%
ggplot(aes(x=PC1,y=PC2,color=Response,label=SRR)) +
geom_point(size=3) +
labs(title="PCA Plot of Sleuth Normalized Kallisto Quantified Data",
x = paste0("PC1: ", round(sleuth_variance_explained[1]*100),"% Variance"),
y = paste0("PC2: ", round(sleuth_variance_explained[2]*100),"% Variance")) +
scale_color_manual(values=RESPONSE_COLORS)
s_obj <- s_obj %>%
sleuth_fit(~Response,"full") %>%
sleuth_fit(~1,"reduced") %>%
sleuth_lrt("reduced","full")
fitting measurement error models
shrinkage estimation
computing variance of betas
fitting measurement error models
shrinkage estimation
computing variance of betas
sleuth_diffex_lrt <- sleuth_results(s_obj,test="reduced:full",test_type="lrt") %>% filter(!is.na(pval))
sleuth_diffex_lrt %>% arrange(qval)
s_obj <- sleuth_wt(s_obj,"ResponseR")
sleuth_diffex_wald <- sleuth_results(s_obj,test="ResponseR",test_type="wald") %>% filter(!is.na(pval))
sleuth_diffex_wald %>% arrange(qval)
plot_volcano(s_obj,test_type="wt",test="ResponseR",sig_level=0.05) +
labs(title="Volcano Plot of Sleuth Identified Differentially Expressed Genes",
x="log2FC", y = "qval",
color="Q < 0.05")
meanSdPlot(log2(sleuth_to_matrix(s_obj,"obs_norm","scaled_reads_per_base") + 1),rank=FALSE,bins=100)
files <- file.path(samples_filt$kallisto_path,"abundance.h5")
names(files) <- samples_filt$`Patient ID`
txi <- tximport(
file.path(samples_filt$kallisto_path,"abundance.h5"),
type = "kallisto",
tx2gene = select(
t2g,
TXNAME = ensembl_transcript_id_version,
GENEID = external_gene_name
),
ignoreAfterBar = TRUE,
)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
summarizing abundance
summarizing counts
summarizing length
summarizing inferential replicates
rownames(samples_filt) <- samples_filt$`Patient ID`
Setting row names on a tibble is deprecated.
deseq_kallisto_data <- DESeqDataSetFromTximport(txi,samples_filt,~Response)
some variables in design formula are characters, converting to factorsusing counts and average transcript lengths from tximport
kallisto_keep <- rowSums(counts(deseq_kallisto_data)>5)>5
deseq_kallisto_data <- deseq_kallisto_data[kallisto_keep,]
deseq_kallisto_norm <- vst(deseq_kallisto_data)
using 'avgTxLength' from assays(dds), correcting for library size
meanSdPlot(assay(deseq_kallisto_norm),ranks=FALSE,bins=100)
plotPCA(deseq_kallisto_norm,intgroup="Response",ntop=500000) +
labs(title="PCA Plot of DESeq Normalized Kallisto Quantified Data",
color="Response") +
scale_color_manual(values=RESPONSE_COLORS)
deseq_kallisto_data <- DESeq(deseq_kallisto_data)
estimating size factors
using 'avgTxLength' from assays(dds), correcting for library size
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
-- replacing outliers and refitting for 1306 genes
-- DESeq argument 'minReplicatesForReplace' = 7
-- original counts are preserved in counts(dds)
estimating dispersions
fitting model and testing
deseq_kallisto_lrt <- results(deseq_kallisto_data,name="Response_R_vs_NR",tidy=TRUE)
deseq_kallisto_lrt %>%
#filter(padj < 0.05) %>%
arrange(padj) %>%
select(row,log2FoldChange,pvalue,padj) %>%
left_join(select(t2g,ensembl_gene_id,external_gene_name),by=c("row"="ensembl_gene_id"))
fc_mat <- read_tsv("../work/input/GSE126044_counts.txt")
Missing column names filled in: 'X1' [1]Parsed with column specification:
cols(
X1 = [31mcol_character()[39m,
Dis_01 = [32mcol_double()[39m,
Dis_02 = [32mcol_double()[39m,
Dis_11 = [32mcol_double()[39m,
Dis_12 = [32mcol_double()[39m,
Dis_15 = [32mcol_double()[39m,
Dis_04 = [32mcol_double()[39m,
Dis_16 = [32mcol_double()[39m,
Dis_17 = [32mcol_double()[39m,
Dis_03 = [32mcol_double()[39m,
Dis_07 = [32mcol_double()[39m,
Dis_10 = [32mcol_double()[39m,
Dis_18 = [32mcol_double()[39m,
Dis_09 = [32mcol_double()[39m,
Dis_05 = [32mcol_double()[39m,
Dis_06 = [32mcol_double()[39m,
Dis_08 = [32mcol_double()[39m
)
fc_mat <- fc_mat[!grepl("^\\d+\\-",fc_mat$X1),] %>%
column_to_rownames("X1")
fc_mat_filt <- fc_mat %>%
select(one_of(samples_filt$`Patient ID`))
deseq_fc <- DESeqDataSetFromMatrix(countData=fc_mat_filt,colData=samples_filt,design=~Response)
converting counts to integer mode
some variables in design formula are characters, converting to factors
fc_keep <- rowSums(counts(deseq_fc)>5)>5
deseq_fc <- deseq_fc[fc_keep,]
deseq_fc_norm <- vst(deseq_fc)
meanSdPlot(assay(deseq_fc_norm),rank=FALSE,bins=100)
plotPCA(deseq_fc_norm,intgroup="Response",ntop=500000) +
labs(title="PCA Plot of DESeq Normalized Feature Counts/STAR Quantified Data",
color="Response") +
scale_color_manual(values=RESPONSE_COLORS)
tfc <- t(fc_mat_filt)
# Convert Kallisto counts to rpm
kalread<-cbind(kallisto_mapped_reads=samples_filt$kallisto_mapped_reads, sleuth_rna_norm %>% rownames_to_column("sample"))
kalrpm<-setDT(kalread) [, lapply(.SD, function(x) (x * 1e6) / (kallisto_mapped_reads)), by=.(kallisto_mapped_reads,sample)] %>%
column_to_rownames("sample")
# Convert star fc to rpm
#get df with est total reads for star
staread<-cbind(star_est_mapped=colSums(fc_mat_filt), as.data.frame(tfc) %>% rownames_to_column("sample"))
starpm<-setDT(staread) [, lapply(.SD, function(x) (x * 1e6) / (star_est_mapped)), by=.(star_est_mapped,sample)] %>%
column_to_rownames("sample")
## scale pca with rpm
kal_rpm_pca <- prcomp(kalrpm[,-1],center=TRUE,scale.=TRUE)
starnorm_std <- scale(starpm[,-1])
#star counts include some zero, set scale/std apply to those !=0
star_rpm_pca <- prcomp(starnorm_std[ , which(apply(starnorm_std, 2, var) != 0)],center=TRUE)
## superimposed pca plots from the two sources
p1 <-kal_rpm_pca$x[,1:2]%>%
as.data.frame() %>%
rownames_to_column("SRR") %>%
left_join(samples_filt,by="SRR")
p2<-star_rpm_pca$x[,1:2] %>%
as.data.frame() %>%
rownames_to_column("Patient ID") %>%
left_join(samples_filt,by="Patient ID")
bind_rows(Kallisto=p1,STAR=p2,.id="quant_type") %>%
ggplot(aes(x=PC1,y=PC2,color=Response,shape=quant_type)) +
geom_point(size=3) +
scale_color_manual(values=RESPONSE_COLORS) +
labs(title="Superimposed PCA of RPM Normalized Counts",
shape="Method")
equation = function(x) {
lm_coef <- list(a = round(as.numeric(coef(x)[1]), digits = 2),
b = round(as.numeric(coef(x)[2]), digits = 2),
r2 = round(summary(x)$r.squared, digits = 2));
lm_eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(R)^2~"="~r2,lm_coef)
as.character(as.expression(lm_eq));
}
log2fc_combined <- sleuth_diffex_wald %>%
filter(!is.na(b)) %>%
select(gene=target_id,sleuth_log2FC=b,sleuth_q=qval) %>%
left_join(select(deseq_kallisto_lrt,row,deseq_kal_log2FC=log2FoldChange,deseq_kal_q=padj),by=c("gene"="row")) %>%
left_join(select(deseq_fc_lrt,row,deseq_fc_log2FC=log2FoldChange,deseq_fc_q=padj),by=c("gene"="row")) %>%
filter(!is.na(deseq_fc_log2FC))
regression.line <- lm(deseq_kal_log2FC ~ sleuth_log2FC, data = log2fc_combined)
log2fc_combined %>%
ggplot(aes(x=sleuth_log2FC,y=deseq_kal_log2FC)) +
geom_point(aes(x=sleuth_log2FC,y=deseq_kal_log2FC)) +
geom_hline(yintercept=0, color="darkgray") +
geom_vline(xintercept=0, color="darkgray") +
theme_pubr(base_size = 14)+
geom_smooth(method = "lm", color="red") +
annotate("text", label = equation(regression.line), x = -3, y = 4, parse = TRUE)+
labs(title="Comparison of log2FC between Sleuth and DESeq Kallisto",
x="log2 Fold Change (Sleuth-Kallisto)",y="log2 Fold Change (DESeq-Kallisto)")
regression.line <- lm(deseq_fc_log2FC ~ deseq_kal_log2FC, data = log2fc_combined)
log2fc_combined %>%
ggplot(aes(x=deseq_kal_log2FC,y=deseq_fc_log2FC)) +
geom_point() +
geom_hline(yintercept=0, color="darkgray") +
geom_vline(xintercept=0, color="darkgray") +
theme_pubr(base_size = 14)+
geom_smooth(method = "lm", color="red") +
annotate("text", label = equation(regression.line), x = -4, y = 5, parse = TRUE)+
labs(title="Comparison of log2FC between DESeq From Kallisto and FC",
x="log2 Fold Change (DESeq-FeatureCounts)",y="log2 Fold Change (DESeq-Kallisto)")
sleuth_diffex_wald <- sleuth_diffex_wald %>% filter(!is.na(b))
deseq_fc_lrt <- deseq_fc_lrt %>% filter(!is.na(log2FoldChange))
deseq_kallisto_lrt <- deseq_kallisto_lrt %>% filter(!is.na(log2FoldChange))
log2fc_full_combined <- sleuth_diffex_wald %>%
select(gene=target_id,sleuth_log2FC=b,sleuth_q=qval) %>%
full_join(select(deseq_kallisto_lrt,row,deseq_kal_log2FC=log2FoldChange,deseq_kal_q=padj),by=c("gene"="row")) %>%
full_join(select(deseq_fc_lrt,row,deseq_fc_log2FC=log2FoldChange,deseq_fc_q=padj),by=c("gene"="row"))
log2fc_full_combined
NA
log2fc_full_combined$log2FoldChange <- log2fc_full_combined$sleuth_log2FC
log2fc_full_combined$padj <- log2fc_full_combined$sleuth_q
log2fc_final <- log2fc_full_combined %>% filter(!is.na(log2FoldChange))
thres <- 1
keyvals.color <- rep('green', nrow(log2fc_final))
# set the base name/label as 'Mid'
names(keyvals.color) <- rep('Mid', nrow(log2fc_final))
keyvals.color[which(log2fc_final$log2FoldChange > thres)] <- 'red'
names(keyvals.color)[which(log2fc_final$log2FoldChange > thres)] <- 'UpRegulated'
keyvals.color[which(log2fc_final$log2FoldChange < -thres)] <- 'royalblue'
names(keyvals.color)[which(log2fc_final$log2FoldChange < -thres)] <- 'DownRegulated'
down.genes <- which((log2fc_final$padj < 0.3) & (log2fc_final$log2FoldChange <= -3))
up.genes <- which((log2fc_final$padj < 0.3) & (log2fc_final$log2FoldChange >= 3))
up.down.genes = c(down.genes, up.genes)
length(up.down.genes)
[1] 27
table(keyvals.color)
keyvals.color
green red royalblue
16210 1794 1338
EnhancedVolcano(log2fc_final,
lab = (log2fc_final$gene),
x = 'log2FoldChange',
y = 'padj',
title = 'R vs. NR - sleuth',
drawConnectors = TRUE,
widthConnectors = 0.3,
colConnectors = 'grey30',
colCustom = keyvals.color,
pCutoff = NA,
cutoffLineType = 'blank',
vline = c(-thres,thres),
vlineCol = c('grey50', 'grey50'),
gridlines.major = FALSE,
gridlines.minor = FALSE,
ylim = c(0,1.5),
selectLab = log2fc_final$gene[up.down.genes]
)
log2fc_full_combined$log2FoldChange <- log2fc_full_combined$deseq_kal_log2FC
log2fc_full_combined$padj <- log2fc_full_combined$deseq_kal_q
log2fc_final <- log2fc_full_combined %>% filter(!is.na(log2FoldChange))
thres <- 1
keyvals.color <- rep('green', nrow(log2fc_final))
# set the base name/label as 'Mid'
names(keyvals.color) <- rep('Mid', nrow(log2fc_final))
keyvals.color[which(log2fc_final$log2FoldChange > thres)] <- 'red'
names(keyvals.color)[which(log2fc_final$log2FoldChange > thres)] <- 'UpRegulated'
keyvals.color[which(log2fc_final$log2FoldChange < -thres)] <- 'royalblue'
names(keyvals.color)[which(log2fc_final$log2FoldChange < -thres)] <- 'DownRegulated'
down.genes <- which((log2fc_final$padj < 0.3) & (log2fc_final$log2FoldChange <= -20))
up.genes <- which((log2fc_final$padj < 0.3) & (log2fc_final$log2FoldChange >= 5))
up.down.genes = c(down.genes, up.genes)
length(up.down.genes)
[1] 6
table(keyvals.color)
keyvals.color
green red royalblue
16311 2059 2369
EnhancedVolcano(log2fc_final,
lab = (log2fc_final$gene),
x = 'log2FoldChange',
y = 'padj',
title = 'R vs. NR - Deseq-kallisto',
drawConnectors = TRUE,
widthConnectors = 0.3,
colConnectors = 'grey30',
colCustom = keyvals.color,
pCutoff = NA,
cutoffLineType = 'blank',
vline = c(-thres,thres),
vlineCol = c('grey50', 'grey50'),
gridlines.major = FALSE,
gridlines.minor = FALSE,
ylim = c(0,10),
selectLab = log2fc_final$gene[up.down.genes]
)
log2fc_full_combined$log2FoldChange <- log2fc_full_combined$deseq_fc_log2FC
log2fc_full_combined$padj <- log2fc_full_combined$deseq_fc_q
log2fc_final <- log2fc_full_combined %>% filter(!is.na(log2FoldChange))
thres <- 1
keyvals.color <- rep('green', nrow(log2fc_final))
# set the base name/label as 'Mid'
names(keyvals.color) <- rep('Mid', nrow(log2fc_final))
keyvals.color[which(log2fc_final$log2FoldChange > thres)] <- 'red'
names(keyvals.color)[which(log2fc_final$log2FoldChange > thres)] <- 'UpRegulated'
keyvals.color[which(log2fc_final$log2FoldChange < -thres)] <- 'royalblue'
names(keyvals.color)[which(log2fc_final$log2FoldChange < -thres)] <- 'DownRegulated'
down.genes <- which((log2fc_final$padj < 0.3) & (log2fc_final$log2FoldChange <= -8))
up.genes <- which((log2fc_final$padj < 0.3) & (log2fc_final$log2FoldChange >= 4))
up.down.genes = c(down.genes, up.genes)
length(up.down.genes)
[1] 9
table(keyvals.color)
keyvals.color
green red royalblue
12879 1621 1650
EnhancedVolcano(log2fc_final,
lab = (log2fc_final$gene),
x = 'log2FoldChange',
y = 'padj',
title = 'R vs. NR - Deseq-fc',
drawConnectors = TRUE,
widthConnectors = 0.3,
colConnectors = 'grey30',
colCustom = keyvals.color,
pCutoff = NA,
cutoffLineType = 'blank',
vline = c(-thres,thres),
vlineCol = c('grey50', 'grey50'),
gridlines.major = FALSE,
gridlines.minor = FALSE,
ylim = c(0,15),
selectLab = log2fc_final$gene[up.down.genes]
)
thres <- 1
# get the name of the upregulated/downregulated genes from 3 different methods
sleuth.up <- sleuth_diffex_wald$target_id[which((sleuth_diffex_wald$qval <= 0.05) & (sleuth_diffex_wald$b >= thres))]
sleuth.down <- sleuth_diffex_wald$target_id[which((sleuth_diffex_wald$qval <= 0.05) & (sleuth_diffex_wald$b <= -thres))]
deseq.k.up <- deseq_kallisto_lrt$row[which((deseq_kallisto_lrt$padj <= 0.05) & (deseq_kallisto_lrt$log2FoldChange >= thres))]
deseq.k.down <- deseq_kallisto_lrt$row[which((deseq_kallisto_lrt$padj <= 0.05) & (deseq_kallisto_lrt$log2FoldChange <= -thres))]
deseq.fc.up <- deseq_fc_lrt$row[which((deseq_fc_lrt$padj <= 0.05) & (deseq_fc_lrt$log2FoldChange >= thres))]
deseq.fc.down <- deseq_fc_lrt$row[which((deseq_fc_lrt$padj <= 0.05) & (deseq_fc_lrt$log2FoldChange <= -thres))]
# up-regulated venn-diagram
grid.newpage()
vd <- venn.diagram(x = list("Sleuth-Kallisto" = (sleuth.up),
"DESeq-Kallisto" = (deseq.k.up),
"DESeq-FeatureCounts" = (deseq.fc.up)),
main="Venn Diagram of Upregulated Genes Between 3 Methods",
fill = brewer.pal(4, "Set2")[1:3], filename = NULL)
grid.draw(vd)
# down-regulated venn-diagram
grid.newpage()
vd <- venn.diagram(x = list("Sleuth-Kallisto" = (sleuth.down),
"DESeq-Kallisto" = (deseq.k.down),
"DESeq-FeatureCounts" = (deseq.fc.down)),
main="Venn Diagram of Downregulated Genes Between 3 Methods",
fill = brewer.pal(4, "Set2")[1:3], filename = NULL)
grid.draw(vd)
#sleuth.up <- sleuth_diffex_wald$target_id[which((sleuth_diffex_wald$qval <= 0.05) & (sleuth_diffex_wald$b >= thres))]
#sleuth.down <- sleuth_diffex_wald$target_id[which((sleuth_diffex_wald$qval <= 0.05) & (sleuth_diffex_wald$b <= -thres))]
normalized_counts <- counts(deseq_kallisto_data, normalized=TRUE)
colnames(normalized_counts)
[1] "Dis_01" "Dis_02" "Dis_03" "Dis_04" "Dis_05" "Dis_06" "Dis_07" "Dis_08" "Dis_09" "Dis_11"
[11] "Dis_12" "Dis_15" "Dis_16" "Dis_17" "Dis_18"
updown.gene <- deseq_kallisto_lrt[which((deseq_kallisto_lrt$log2FoldChange > 1 | deseq_kallisto_lrt$log2FoldChange < -1) & deseq_kallisto_lrt$padj < 0.05),]
updown.gene$updown <- ifelse((updown.gene$log2FoldChange> 1), "up", "down")
updown.regul <- as.matrix(updown.gene[,"updown", drop=FALSE])
heatmapdata <- normalized_counts[match(updown.gene$row, row.names(normalized_counts)),]
zscore_data <- t(apply(heatmapdata, 1, scale))
colnames(zscore_data) = colnames(heatmapdata)
col_updown = c("up" = "pink", "down" = "purple")
row.names(samples_filt) <- samples_filt$`Patient ID`
Setting row names on a tibble is deprecated.
samples_filt
ha = HeatmapAnnotation(type = samples_filt$Response, annotation_name_side = "left",
col = list(type = c("NR" = "yellow", "R" = "black")))
dim(normalized_counts)
[1] 20739 15
ht1 = Heatmap(zscore_data, name = "expression",
col = colorRamp2(c(-2, 0, 2), c("blue", "white", "red")),
show_column_names = FALSE, show_row_names=FALSE,
row_title = NULL,
bottom_annotation = ha,
heatmap_width = 400, heatmap_height = 100)
ht2 = Heatmap(updown.regul, name = "up vs down", col = col_updown, show_row_names=FALSE)
ht_list = ht1 + ht2
draw(ht_list, row_km = 1, row_split = updown.regul, cluster_rows = TRUE)
#sleuth.up <- sleuth_diffex_wald$target_id[which((sleuth_diffex_wald$qval <= 0.05) & (sleuth_diffex_wald$b >= thres))]
#sleuth.down <- sleuth_diffex_wald$target_id[which((sleuth_diffex_wald$qval <= 0.05) & (sleuth_diffex_wald$b <= -thres))]
normalized_counts <- counts(deseq_fc, normalized=TRUE)
updown.gene <- deseq_fc_lrt[which((deseq_fc_lrt$log2FoldChange > 1 | deseq_fc_lrt$log2FoldChange < -1) & deseq_fc_lrt$padj < 0.05),]
updown.gene$updown <- ifelse((updown.gene$log2FoldChange> 1), "up", "down")
updown.regul <- as.matrix(updown.gene[,"updown", drop=FALSE])
heatmapdata <- normalized_counts[match(updown.gene$row, row.names(normalized_counts)),]
zscore_data <- t(apply(heatmapdata, 1, scale))
colnames(zscore_data) = colnames(heatmapdata)
col_updown = c("up" = "pink", "down" = "purple")
row.names(samples_filt) <- samples_filt$`Patient ID`
Setting row names on a tibble is deprecated.
samples_filt
ha = HeatmapAnnotation(type = samples_filt$Response, annotation_name_side = "left",
col = list(type = c("NR" = "yellow", "R" = "black")))
dim(normalized_counts)
[1] 16150 15
ht1 = Heatmap(zscore_data, name = "expression",
col = colorRamp2(c(-2, 0, 2), c("blue", "white", "red")),
show_column_names = FALSE, show_row_names=FALSE,
row_title = NULL,
bottom_annotation = ha,
heatmap_width = 400, heatmap_height = 100)
ht2 = Heatmap(updown.regul, name = "up vs down", col = col_updown, show_row_names=FALSE)
ht_list = ht1 + ht2
draw(ht_list, row_km = 1, row_split = updown.regul, cluster_rows = TRUE)
sample_summary <- data.frame(fc_mapped_reads=colSums(fc_mat)) %>%
rownames_to_column("Patient ID") %>%
left_join(samples,by="Patient ID") %>%
select(`Patient ID`,SRR,Response,"Sequenced Reads"=n_reads,"Mapped Reads(Kallisto)"=kallisto_mapped_reads,"Mapped Reads(STAR/FeatureCounts)"=fc_mapped_reads)
sample_summary
sleuth_diffex_lrt %>% write_tsv("sleuth_diffex_lrt.tsv")
sleuth_diffex_wald %>% write_tsv("sleuth_diffex_wald.tsv")
deseq_kallisto_lrt %>% write_tsv("deseq_kallisto_lrt.tsv")
deseq_fc_lrt %>% write_tsv("deseq_fc_lrt.tsv")
sample_summary%>% write_tsv("sample_summary.tsv")